by William Suzuki

We simulate some data generating processes (DGPs). Then we fit random forests on samples drawn from the simulated DGPs.

#191231 script1
#preliminaries ----------------------
library(tidyverse)
library(randomForest)
library(sp)
library(magick) # to make gifs

Classification

In this section we make a DGP for a classification problem. We build a region which is a circle: inside the circle the class is 1, outside it is 0. The total region is a square of 10 by 10. The circle has center (5,5) and radius 3.

#simulate train test sets -----------------------------
#simulate a dgp for a classification problem: a circle in the middle of a square area

#1st 
#draw 2d points uniformly at random on the [0,10] x [0,10] square
#training set
set.seed(1)
x = runif(n = 1000, min = 0, max = 10)
y = runif(n = 1000, min = 0, max = 10)
df = data.frame(x, y)
# class 1 inside the circle of radius 3 centered at (5,5), class 0 outside
z = ifelse((x - 5)^2 + (y - 5)^2 <= 9, 1, 0)
df$z = factor(z)
str(df)
## 'data.frame':    1000 obs. of  3 variables:
##  $ x: num  2.66 3.72 5.73 9.08 2.02 ...
##  $ y: num  5.31 6.85 3.83 9.55 1.18 ...
##  $ z: Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 2 1 1 ...
#an article to finish:
#https://towardsdatascience.com/learn-classification-with-random-forests-in-r-998d3a734098
#test set: same DGP, different seed
set.seed(2)
x = runif(n = 1000, min = 0, max = 10)
y = runif(n = 1000, min = 0, max = 10)
df_test = data.frame(x, y)
# same circle indicator as the training set
z = ifelse((x - 5)^2 + (y - 5)^2 <= 9, 1, 0)
df_test$z = factor(z)
#fit a random forest on the training set ------------------------
set.seed(1)
model = randomForest(formula =z~x+y, data=df, ntree=100, mtry=1,nodesize=1)
#if mtry = number of covariates then the random forest is a bagging of trees
model
## 
## Call:
##  randomForest(formula = z ~ x + y, data = df, ntree = 100, mtry = 1,      nodesize = 1) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 2.5%
## Confusion matrix:
##     0   1 class.error
## 0 722  13  0.01768707
## 1  12 253  0.04528302
zh = predict(model,df_test)
# BUG FIX: accuracy must compare the TEST-set labels (df_test$z) with the
# test-set predictions. The original compared df$z (training labels), whose
# rows do not correspond to df_test, which produced a spuriously low
# "accuracy" of 0.615 and the mistaken conclusion of poor test performance.
(df_test$z == zh) %>% mean #accuracy on the test set
# scatter of the training data, colored by class
ggplot() +
  geom_point(data = df, aes(x = x, y = y, color = z), alpha = 0.7, size = 3) +
  ggtitle("Simulated classification problem")

#training data distribution on 2d.
#create circle dgp's frontier -----------------------
# parameterize the circle boundary as two half-arcs: upper (left to right),
# then lower (right to left) so the polygon closes on itself
x_upper_arc = seq(2, 8, by = .1)
x_lower_arc = seq(8, 2, by = -.1)
x_coord = c(x_upper_arc, x_lower_arc)
plot(x_coord)

# y = 5 +/- sqrt(9 - (x - 5)^2): upper arc first, then lower arc
y_coord = c((9 - (x_upper_arc - 5)^2)^.5 + 5, -(9 - (x_lower_arc - 5)^2)^.5 + 5)
plot(y_coord)

plot(x_coord, y_coord)

# wrap the boundary in an sp polygon so it can be drawn with ggplot
xym = cbind(x_coord, y_coord)
p = Polygon(xym)
ps = Polygons(list(p), 1)
sps = SpatialPolygons(list(ps))
plot(sps)

circle_fortify = fortify(sps)
ggplot() +
  geom_point(data = df, aes(x = x, y = y, color = z), alpha = 0.7, size = 3) +
  geom_polygon(data = circle_fortify, aes(long, lat), fill = NA, color = "black") +
  coord_equal() +
  ggtitle("Simulated classification problem")

#analyze prediction ---------------------------------
# build a 71 x 71 evaluation grid covering the square
x = seq(0, 10, by = 10/70)
y = seq(0, 10, by = 10/70)
plot(x, y)

df_sim = expand.grid(x = x, y = y)
class(df_sim)
## [1] "data.frame"
dim(df_sim)
## [1] 5041    2
plot(df_sim$x, df_sim$y)

# predicted class at every grid point reveals the learned decision surface
zh_sim = predict(model, df_sim)
df_sim$z = zh_sim
ggplot() +
  geom_polygon(data = circle_fortify, aes(long, lat), fill = NA, color = "black") +
  geom_point(data = df_sim, aes(x = x, y = y, color = z), alpha = 0.7, size = 5) +
  ggtitle("Simulated ntree=100")

#there are some rough edges...
#this model was built with ntree=100, mtry=1, nodesize=1
#function of graph and accuracy ---------------
plot_rf = function(model, ntree=100,mtry=2,nodesize=1,graph=TRUE){
  output = list()
  set.seed(1)
  model = randomForest(formula =z~x+y, data=df, ntree=ntree, mtry=mtry, nodesize=nodesize)
  zh = predict(model,df_test)
  accuracy = mean(df_test$z==zh) #accuracy
  zh_sim = predict(model,df_sim)
  df_sim$z = zh_sim
  if(graph==TRUE){
    plot_ggplot = ggplot() +
      geom_point(data=df_sim, aes(y=y, x=x, color=z),size=3,alpha=0.7)+
      geom_polygon(data=circle_fortify,aes(long,lat), fill = NA, color = "black") +
      ggtitle(paste0("Simulated ",'ntree=',ntree,";nodesize=",nodesize,";accuracy=", accuracy))
    output[[1]] = plot_ggplot
  }
  output[[2]] = accuracy
  output
}

# random forest (mtry = 1) with deep trees; [[1]] extracts the plot
plot_rf(model,ntree=100,mtry=1,nodesize=1)[[1]]

#this is the case of the bagging model (mtry = number of covariates)
plot_rf(model,ntree=100,mtry=2,nodesize=1)
## [[1]]

## 
## [[2]]
## [1] 0.978

The only difference between this graph and the previous one is mtry. Here mtry=2, before mtry=1. The only difference between a Random Forest and a Bagging of Trees is that the number of variables considered at each step of tree building is smaller.

In our case there are only 2 variables for the algorithm to choose from. But in cases with many more variables, limiting and randomly assigning the variables available at each split improves the model’s out-of-sample prediction. That is, restricting the algorithm’s choice decreases variance.

# random forest with just 1 tree -------------------
# a single (bootstrap-grown) tree: coarser boundary, lower accuracy
plot_rf(model,ntree=1,mtry=2,nodesize=1)
## [[1]]

## 
## [[2]]
## [1] 0.963

When we use 1 tree the model performs worse. As expected the edges are rougher: this is a full tree, but grown on a bootstrap sample until it reaches a nodesize of 1.

number of trees

#graph showing how accuracy changes with ntree -----------------
# dense grid for small ntree, coarser (step 5) from 50 to 300
tree_seq = c(1:50,seq(50,300,by=5))
len1 = length(tree_seq)
# test-set accuracy sweep for the random forest (mtry = 1)
accuracy_seq = rep(NA,len1)
for (ii in 1:len1) {
  ntree = tree_seq[ii]
  accuracy = plot_rf(model,ntree=ntree,mtry=1,nodesize=1,graph=FALSE)[[2]]
  accuracy_seq[ii] = accuracy
}

# same sweep for bagging (mtry = 2 = all covariates)
accuracy_seq_bag = rep(NA,len1)
for (ii in 1:len1) {
  ntree = tree_seq[ii]
  accuracy = plot_rf(model,ntree=ntree,mtry=2,nodesize=1,graph=FALSE)[[2]]
  accuracy_seq_bag[ii] = accuracy
}

# black line: mtry = 1 (random forest); blue line: mtry = 2 (bagging)
plot(tree_seq,accuracy_seq, type='l')
lines(tree_seq,accuracy_seq_bag,col='blue')

Using mtry=1 is better, as expected: random forests in general perform better than bagging on the test set. The black line is mtry=1; the blue line is mtry=2.

#animations ntree -------------------
#how the fit changes with the number of trees
tree_seq_gif = seq(1, 30, by = 1)
len2 = length(tree_seq_gif)
for (ii in 1:len2) {
  ntree = tree_seq_gif[ii]
  # BUG FIX: inside a for loop a ggplot object is NOT auto-printed, so the
  # original ggsave() (which defaults to last_plot()) could save a stale plot.
  # Capture the plot and pass it to ggsave explicitly.
  p = plot_rf(model, ntree = ntree, mtry = 1, nodesize = 1)[[1]]
  # 100 + ntree keeps lexicographic file order equal to numeric order
  ggsave(paste0("images/ntree/","ntree",100+ntree,".jpg"), plot = p,
         width = 25, height = 25, units = "cm", dpi = 60)
}


#to create gifs
library(magick) # to make gifs
list.files(path = "./images/ntree", pattern = "*.jpg", full.names = TRUE) %>% 
  map(image_read) %>% # reads each path file
  image_join() %>% # joins images
  image_animate(fps=2) %>% # animates, can opt for number of loops
  image_write("ntree_change.gif") # write to current dir

nodesize

#graph accuracy change nodesize --------------------------------
# sweep nodesize from coarse (380) down to fine (1), denser near small values
nodesize_seq =c(seq(380,100,by=-20),seq(100,50,by=-10),seq(50,20,by=-5),seq(20,1,by=-1))  
len2 = length(nodesize_seq)
accuracy_seq = rep(NA,len2)
for (ii in 1:len2) {
  nodesize = nodesize_seq[ii]
  accuracy = plot_rf(model,ntree=100,mtry=1,nodesize=nodesize,graph=FALSE)[[2]]
  accuracy_seq[ii] = accuracy
}
# reversed x-axis: tree depth increases to the right (nodesize decreases)
plot(nodesize_seq,accuracy_seq, type='l',xlim = rev(range(nodesize_seq)))

# zoom into the small-nodesize region of the curve
xmin = 1
xmax = 50
ymin = .958
ymax = .99
plot(nodesize_seq,accuracy_seq, type='l',
     xlim = rev(c(xmin,xmax)), ylim=c(ymin, ymax))

#animation for changing nodesize ---------------------------
#smaller nodesize means deeper trees; a large nodesize gives shallower trees
#and therefore lower accuracy
for (ii in 1:len2) {
  nodesize = nodesize_seq[ii]
  # BUG FIX: inside a for loop a ggplot object is NOT auto-printed, so the
  # original ggsave() (which defaults to last_plot()) could save a stale plot.
  # Capture the plot and pass it to ggsave explicitly.
  p = plot_rf(model, ntree = 50, mtry = 1, nodesize = nodesize)[[1]]
  ggsave(paste0("images/nodesize/","nodesize",100+ii,".jpg"), plot = p,
         width = 25, height = 25, units = "cm", dpi = 60)
}

#to create gifs
list.files(path = "./images/nodesize", pattern = "*.jpg", full.names = TRUE) %>% 
  map(image_read) %>% # reads each path file
  image_join() %>% # joins images
  image_animate(fps=2) %>% # animates, can opt for number of loops
  image_write("nodesize_change.gif") # write to current dir

#animation changing training set size

Regression

In this section we build a function of one covariate for the random forest algorithm to fit.

#polynomial function in 2 dimensions ----------------------
# true conditional mean of the regression DGP
true_fun = function(x) 1 + 1*x + .2*x^2 - .01*x^3 + 10*sin(x)

#true process: noiseless curve on a fine grid
set.seed(1)
x = seq(0, 10, by = .05)
y = true_fun(x)
df_sim = data.frame(y, x)
plot(x, y, type = 'l')

#sample of the true process
#training set: noisy observations at uniformly random x
set.seed(1)
x_train = runif(1000, 0, 10)
x = x_train
len_x = length(x)
y = true_fun(x) + rnorm(len_x, 0, 2)
df_train = data.frame(y, x)
plot(x, y)

This is the training sample.

#test set: same DGP, new seed, 500 observations
set.seed(2)
x_test = runif(500, 0, 10)
x = x_test
len_x = length(x)
# noise drawn after x so the RNG stream matches the training construction
noise = rnorm(len_x, 0, 2)
y = 1 + 1*x + .2*x^2 - .01*x^3 + 10*sin(x) + noise
df_test = data.frame(y, x)
plot(x, y)

This is the test sample.

RMSE

# Regression version of plot_rf: fit y ~ x on the global df_train, compute
# test-set RMSE on the global df_test, and optionally plot the true curve
# (black) against the forest's fit (red) over the global grid df_sim.
#
# NOTE(review): the `model` argument is ignored (the function refits
# internally); kept in the signature for backward compatibility.
#
# Returns: a list; [[1]] is the plot (only when graph = TRUE), [[2]] is rmse.
plot_rf = function(model, ntree=100, mtry=1, nodesize=1, graph=TRUE){
  output = list()
  set.seed(1)  # fixed seed so fits are comparable across hyper-parameter sweeps
  model = randomForest(formula = y ~ x, data = df_train, ntree = ntree, mtry = mtry, nodesize = nodesize)
  yh = predict(model, df_test)
  rmse = mean((df_test$y - yh)^2)^.5  # root mean squared error on the test set
  yh_sim = predict(model, df_sim)
  df_sim$yh = yh_sim  # local copy; the global df_sim is untouched
  if (isTRUE(graph)) {
    plot_ggplot = ggplot() +
      geom_line(data = df_sim, aes(x = x, y = y), color = "black") +  # true curve
      geom_line(data = df_sim, aes(x = x, y = yh), color = "red") +   # RF fit
      ggtitle(paste0('ntree=',ntree,"; nodesize=",nodesize,"; RMSE=", round(rmse,2)))
    output[[1]] = plot_ggplot
  }
  # BUG FIX: the original did `output[[2]] = accuracy`, where `accuracy` was a
  # stale GLOBAL left over from the classification section -- so callers got
  # a classification accuracy instead of this model's RMSE. Return rmse.
  output[[2]] = rmse
  output
}
# baseline: many deep trees (nodesize = 1)
plot_rf(model,ntree=100,mtry=1,nodesize=1)[[1]]

# large nodesize: shallow trees
plot_rf(model,ntree=100,mtry=1,nodesize=200)[[1]]

#rmse dropped with increasing nodesize
#the shape is more box-like when we increase nodesize
plot_rf(model,ntree=1,mtry=1,nodesize=1)[[1]]

#it seems that there is not much impact of ntree on shape and rmse
plot_rf(model,ntree=1,mtry=1,nodesize=200)[[1]]

#but nodesize has a strong impact on the fitted shape
#(original note on rmse was left unfinished here)

The above graphs show how the random forest fit changes with nodesize.

sample size

#increase training sample --------------------------------------------

#increase training sample --------------------------------------------

# Like the regression plot_rf, but the training set is re-simulated with
# `sample` observations before fitting, so we can study the effect of
# training-set size. Test RMSE is computed on the global df_test; the fit is
# drawn over the global df_sim.
# NOTE(review): the `model` argument is ignored; kept for compatibility.
plot_rf_sample = function(model, ntree=100,mtry=1,nodesize=1,sample=1000,graph=TRUE){
  result = list()
  # fixed seed: training draw and forest are reproducible for a given `sample`
  set.seed(1)
  xs = runif(sample, 0, 10)
  x_train = xs
  x = x_train
  n_obs = length(x)
  y = 1 + 1*x + .2*x^2 - .01*x^3 + 10*sin(x) + rnorm(n_obs, 0, 2)
  df_train = data.frame(y, x)
  fit = randomForest(formula = y ~ x, data = df_train,
                     ntree = ntree, mtry = mtry, nodesize = nodesize)
  yh = predict(fit, df_test)
  rmse = mean((df_test$y - yh)^2)^.5  # test-set root mean squared error
  df_sim$yh = predict(fit, df_sim)    # local copy of the grid
  if (graph == TRUE) {
    result[[1]] = ggplot() +
      geom_line(data = df_sim, aes(x = x, y = y), color = "black") +  # truth
      geom_line(data = df_sim, aes(x = x, y = yh), color = "red") +   # fit
      xlim(0, 10) +
      ylim(-5, 30) +
      ggtitle(paste0('ntree=',ntree,"; nodesize=",nodesize,"; sample=",sample,"; RMSE=", round(rmse,2)))
  }
  result[[2]] = rmse
  result
}
# baseline: training sample of 1000 observations
plot_rf_sample(model, ntree=100,mtry=1,nodesize=1,sample=1000,graph=TRUE)[[1]]

# tiny training sample: the fit degrades visibly
plot_rf_sample(model, ntree=100,mtry=1,nodesize=1,sample=10,graph=TRUE)[[1]]

Note how the graph changes given a smaller training sample.

#animations: fit as the training sample grows
sample_size =c(seq(1,30,by=1), seq(30,80,by=5),seq(100,200,by=20),seq(250,500,by=50),seq(600,1000,by=100),seq(1000,5000,by=1000))
len2 = length(sample_size)
rmse_vec = rep(NA,len2)
for (ii in 1:len2) {
  sample = sample_size[ii]
  output1 = plot_rf_sample(model, ntree=100,mtry=1,nodesize=1,sample=sample,graph=TRUE)
  rmse_vec[ii] = output1[[2]]
  # BUG FIX: a bare `output1[[1]]` inside a loop does NOT print the plot, so
  # ggsave()'s default last_plot() could save a stale plot. Pass the plot
  # object to ggsave explicitly.
  ggsave(paste0("images/sample/","sample",1000+ii,".jpg"), plot = output1[[1]],
         width = 30, height = 22, units = "cm", dpi = 60)
}


#to create gifs
list.files(path = "./images/sample", pattern = "*.jpg", full.names = TRUE) %>% 
  map(image_read) %>% # reads each path file
  image_join() %>% # joins images
  image_animate(fps=2) %>% # animates, can opt for number of loops
  image_write("sample_change.gif") # write to current dir

# RMSE as a function of training-sample size (no plots saved this time)
sample_size =c(seq(1,30,by=1), seq(30,80,by=5),seq(100,200,by=20),seq(250,500,by=50),seq(600,1000,by=100))
len2 = length(sample_size)
rmse_vec = vapply(sample_size, function(n) {
  plot_rf_sample(model, ntree = 100, mtry = 1, nodesize = 1,
                 sample = n, graph = FALSE)[[2]]
}, numeric(1))
plot(sample_size, rmse_vec, type = 'l')

# zoom in on the flatter part of the curve
xmin = 50
xmax = 1000
ymin = 2.35
ymax = 2.65
plot(sample_size, rmse_vec, type = 'l',
     xlim = c(xmin, xmax), ylim = c(ymin, ymax))

Concluding Remarks

Next we’ll explore how the behaviour of these models is uncovered by methods for explaining black-box models.